#install.packages("wesanderson")
# Load
library(wesanderson)
## Warning: package 'wesanderson' was built under R version 4.1.3
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.1.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forecast)
## Warning: package 'forecast' was built under R version 4.1.2
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(caret)
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: ggplot2
## Loading required package: lattice
library(rpart)
## Warning: package 'rpart' was built under R version 4.1.2
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.2

R Markdown

##loading csv files

train <-read.csv("D:/Projects/DemandForecasting/train_GzS76OK/train.csv")
fulfilment <-read.csv("D:/Projects/DemandForecasting/train_GzS76OK/fulfilment_center_info.csv")
meal <-read.csv("D:/Projects/DemandForecasting/train_GzS76OK/meal_info.csv")
test <-read.csv("D:/Projects/DemandForecasting/test_QoiMO9B.csv")

###Merging the files

Train1<-merge(train,fulfilment,by='center_id')
trainfull <- merge(Train1,meal, by='meal_id')
trainfull
Test1<-merge(test,fulfilment,by='center_id')
testfull <- merge(Test1,meal, by='meal_id')
testfull

###Exploratory Data Analysis ##Summary Statistics

summary(trainfull)
##     meal_id       center_id            id               week       
##  Min.   :1062   Min.   : 10.00   Min.   :1000000   Min.   :  1.00  
##  1st Qu.:1558   1st Qu.: 43.00   1st Qu.:1124999   1st Qu.: 39.00  
##  Median :1993   Median : 76.00   Median :1250184   Median : 76.00  
##  Mean   :2024   Mean   : 82.11   Mean   :1250096   Mean   : 74.77  
##  3rd Qu.:2539   3rd Qu.:110.00   3rd Qu.:1375140   3rd Qu.:111.00  
##  Max.   :2956   Max.   :186.00   Max.   :1499999   Max.   :145.00  
##  checkout_price     base_price     emailer_for_promotion homepage_featured
##  Min.   :  2.97   Min.   : 55.35   Min.   :0.00000       Min.   :0.0000   
##  1st Qu.:228.95   1st Qu.:243.50   1st Qu.:0.00000       1st Qu.:0.0000   
##  Median :296.82   Median :310.46   Median :0.00000       Median :0.0000   
##  Mean   :332.24   Mean   :354.16   Mean   :0.08115       Mean   :0.1092   
##  3rd Qu.:445.23   3rd Qu.:458.87   3rd Qu.:0.00000       3rd Qu.:0.0000   
##  Max.   :866.27   Max.   :866.27   Max.   :1.00000       Max.   :1.0000   
##    num_orders        city_code      region_code    center_type       
##  Min.   :   13.0   Min.   :456.0   Min.   :23.00   Length:456548     
##  1st Qu.:   54.0   1st Qu.:553.0   1st Qu.:34.00   Class :character  
##  Median :  136.0   Median :596.0   Median :56.00   Mode  :character  
##  Mean   :  261.9   Mean   :601.6   Mean   :56.61                     
##  3rd Qu.:  324.0   3rd Qu.:651.0   3rd Qu.:77.00                     
##  Max.   :24299.0   Max.   :713.0   Max.   :93.00                     
##     op_area        category           cuisine         
##  Min.   :0.900   Length:456548      Length:456548     
##  1st Qu.:3.600   Class :character   Class :character  
##  Median :4.000   Mode  :character   Mode  :character  
##  Mean   :4.084                                        
##  3rd Qu.:4.500                                        
##  Max.   :7.000
summary(testfull)
##     meal_id       center_id           id               week      
##  Min.   :1062   Min.   : 10.0   Min.   :1000085   Min.   :146.0  
##  1st Qu.:1558   1st Qu.: 43.0   1st Qu.:1123969   1st Qu.:148.0  
##  Median :1993   Median : 76.0   Median :1247296   Median :150.0  
##  Mean   :2032   Mean   : 81.9   Mean   :1248476   Mean   :150.5  
##  3rd Qu.:2569   3rd Qu.:110.0   3rd Qu.:1372971   3rd Qu.:153.0  
##  Max.   :2956   Max.   :186.0   Max.   :1499996   Max.   :155.0  
##  checkout_price     base_price      emailer_for_promotion homepage_featured
##  Min.   :  67.9   Min.   :  89.24   Min.   :0.00000       Min.   :0.00000  
##  1st Qu.: 214.4   1st Qu.: 243.50   1st Qu.:0.00000       1st Qu.:0.00000  
##  Median : 320.1   Median : 321.13   Median :0.00000       Median :0.00000  
##  Mean   : 341.9   Mean   : 356.49   Mean   :0.06644       Mean   :0.08136  
##  3rd Qu.: 446.2   3rd Qu.: 455.93   3rd Qu.:0.00000       3rd Qu.:0.00000  
##  Max.   :1113.6   Max.   :1112.62   Max.   :1.00000       Max.   :1.00000  
##    city_code      region_code    center_type           op_area     
##  Min.   :456.0   Min.   :23.00   Length:32573       Min.   :0.900  
##  1st Qu.:556.0   1st Qu.:34.00   Class :character   1st Qu.:3.600  
##  Median :596.0   Median :56.00   Mode  :character   Median :4.000  
##  Mean   :601.5   Mean   :56.71                      Mean   :4.088  
##  3rd Qu.:651.0   3rd Qu.:77.00                      3rd Qu.:4.500  
##  Max.   :713.0   Max.   :93.00                      Max.   :7.000  
##    category           cuisine         
##  Length:32573       Length:32573      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

Removing Outlier

max(trainfull$num_orders)
## [1] 24299
trainf<-subset(trainfull, num_orders!=24299)

###Checking for Null values

missing_values<-summarise_all(trainf,funs(sum(is.na(.))/n()))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
missing_values<-list(missing_values,key="feature",value="MissingPoints")

###Visualizing Number of Orders

plot(trainf$checkout_price,trainf$num_orders, col= "blue", main="Number of Orders Processed Vs CheckoutPrice")

###Visualizing Number of Orders vsWeek numbers

plot(trainf$week,trainf$num_orders, col="red",main="No of Orders processed vs Week numbers")

###Visualizing Number of Orders vs Meal id

plot(trainf$meal_id,trainf$num_orders,col='dark Green', main="No of Orders procesed vs Meal id")

###Visualizing Number of Orders vs Center id

plot(trainf$center_id,trainf$num_orders,col='blue', main="No of Orders processed vs Center id")

###Visualizing Number of Orders vs emailer for promotion

g<-ggplot(data=trainf)
g<-g+geom_bar(stat="identity",aes(x=emailer_for_promotion,y=num_orders,fill=emailer_for_promotion))
g<-g+xlab("Emailer For promotion")+ylab("Number of orders")+theme(legend.position = "none")+scale_y_continuous(name="Number of orders", labels = scales::comma)
g 

###Visualizing Number of Orders vs Homepage featured

g<-ggplot(data=trainf)
g<-g+geom_bar(stat="identity",aes(x=homepage_featured, y= num_orders,fill=homepage_featured))
g<-g+xlab("Home Page featured")+ylab("Number of orders")+theme(legend.position = "none")
g+scale_y_continuous(name="Number of orders", labels = scales::comma)

###Visualizing Number of Orders vs Category

g<-ggplot(data=trainf)
g<-g+geom_bar(stat="identity",aes(x=category, y= num_orders,fill=category))
g<-g+xlab("Category")+ylab("Number of orders")+theme(legend.position = "none")
g+scale_y_continuous(name="Number of orders", labels = scales::comma)

###Visualizing Number of Orders vs Cuisine

g<-ggplot(data=trainf)
g<-g+geom_bar(stat="identity",aes(x=cuisine, y= num_orders,fill=cuisine))
g<-g+xlab("Cuisine")+ylab("Number of orders")+theme(legend.position = "none")
g+scale_y_continuous(name="Number of orders", labels = scales::comma)

###Visualizing Number of Orders vs Center_type

g<-ggplot(data=trainf)
g<-g+geom_bar(stat="identity",aes(x=center_type, y= num_orders,fill=center_type))
g<-g+xlab("Center_type")+ylab("Number of orders")+theme(legend.position = "none")
g+scale_y_continuous(name="Number of units", labels = scales::comma)

###Correlation matrix

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.2
## corrplot 0.92 loaded
M<-cor(trainf[, unlist(lapply(trainf, is.numeric))],method="pearson")
M
##                             meal_id    center_id            id          week
## meal_id                1.0000000000  0.009893983  4.869528e-04  0.0198162535
## center_id              0.0098939827  1.000000000  2.643985e-03 -0.0034529968
## id                     0.0004869528  0.002643985  1.000000e+00  0.0022303549
## week                   0.0198162535 -0.003452997  2.230355e-03  1.0000000000
## checkout_price         0.0107490579  0.001345637  1.944697e-03  0.0265774054
## base_price             0.0026054232  0.000603923  2.910267e-03  0.0286127377
## emailer_for_promotion  0.0133984674  0.013664481  1.990718e-03 -0.0008288659
## homepage_featured      0.0163515596 -0.005037491  3.102039e-03 -0.0082526328
## num_orders             0.0105749006 -0.053136539  5.177228e-04 -0.0170555268
## city_code             -0.0031975495  0.061077510 -3.861266e-04  0.0004040175
## region_code           -0.0016619278 -0.003426370 -6.480134e-04  0.0045993911
## op_area               -0.0015472583 -0.111867232 -9.263352e-05  0.0015530210
##                       checkout_price   base_price emailer_for_promotion
## meal_id                  0.010749058  0.002605423          0.0133984674
## center_id                0.001345637  0.000603923          0.0136644812
## id                       0.001944697  0.002910267          0.0019907183
## week                     0.026577405  0.028612738         -0.0008288659
## checkout_price           1.000000000  0.953389824          0.0048260946
## base_price               0.953389824  1.000000000          0.1711766436
## emailer_for_promotion    0.004826095  0.171176644          1.0000000000
## homepage_featured       -0.057177959  0.057158546          0.3905210374
## num_orders              -0.283103314 -0.223173519          0.2778266152
## city_code               -0.004805222 -0.002054225         -0.0052325861
## region_code             -0.003648476 -0.001934504         -0.0074615272
## op_area                  0.021571634  0.018031715         -0.0194693973
##                       homepage_featured    num_orders     city_code
## meal_id                     0.016351560  0.0105749006 -0.0031975495
## center_id                  -0.005037491 -0.0531365394  0.0610775099
## id                          0.003102039  0.0005177228 -0.0003861266
## week                       -0.008252633 -0.0170555268  0.0004040175
## checkout_price             -0.057177959 -0.2831033139 -0.0048052224
## base_price                  0.057158546 -0.2231735189 -0.0020542250
## emailer_for_promotion       0.390521037  0.2778266152 -0.0052325861
## homepage_featured           1.000000000  0.2953075422  0.0086416326
## num_orders                  0.295307542  1.0000000000  0.0417882255
## city_code                   0.008641633  0.0417882255  1.0000000000
## region_code                 0.003605678  0.0298695703  0.0426857134
## op_area                     0.041492872  0.1775709033  0.1314765884
##                         region_code       op_area
## meal_id               -0.0016619278 -1.547258e-03
## center_id             -0.0034263704 -1.118672e-01
## id                    -0.0006480134 -9.263352e-05
## week                   0.0045993911  1.553021e-03
## checkout_price        -0.0036484756  2.157163e-02
## base_price            -0.0019345044  1.803171e-02
## emailer_for_promotion -0.0074615272 -1.946940e-02
## homepage_featured      0.0036056776  4.149287e-02
## num_orders             0.0298695703  1.775709e-01
## city_code              0.0426857134  1.314766e-01
## region_code            1.0000000000  2.332746e-02
## op_area                0.0233274638  1.000000e+00
corrplot(M,method="circle")

##Building Random Forest ML Model

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.2
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
rf<-randomForest(num_orders~meal_id+checkout_price+base_price+emailer_for_promotion+homepage_featured+center_type+cuisine,data=trainf,ntree=50)
rf
## 
## Call:
##  randomForest(formula = num_orders ~ meal_id + checkout_price +      base_price + emailer_for_promotion + homepage_featured +      center_type + cuisine, data = trainf, ntree = 50) 
##                Type of random forest: regression
##                      Number of trees: 50
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 54996.51
##                     % Var explained: 64.63
summary(rf)
##                 Length Class  Mode     
## call                 4 -none- call     
## type                 1 -none- character
## predicted       456547 -none- numeric  
## mse                 50 -none- numeric  
## rsq                 50 -none- numeric  
## oob.times       456547 -none- numeric  
## importance           7 -none- numeric  
## importanceSD         0 -none- NULL     
## localImportance      0 -none- NULL     
## proximity            0 -none- NULL     
## ntree                1 -none- numeric  
## mtry                 1 -none- numeric  
## forest              11 -none- list     
## coefs                0 -none- NULL     
## y               456547 -none- numeric  
## test                 0 -none- NULL     
## inbag                0 -none- NULL     
## terms                3 terms  call

###Splitting the test data into Public and Private

Private.index <- sample(c(1:dim(testfull)[1]), dim(testfull)[1]*0.7)
Public.index <- setdiff(c(1:dim(testfull)[1]), Private.index)
Privatet.df <- testfull[Private.index, ]
Public.df <- testfull[Public.index, ]

##Predicting on Train Data

library(dplyr)

trainf$predRF<-predict(rf,data=trainf)
head(select(trainf,c(predRF,num_orders)))
rmsle<- sqrt(1/length(trainf$num_orders)*sum((log(trainf$predRF +1)-log(trainf$num_orders +1))^2))

##Copying the Predicted values of test data to csv

predict_test <-predict(rf,newda=testfull)
Public_predict<-data.frame(testfull$id,num_orders=predict_test)
write.csv(Public_predict,"Public_predict_1.csv",row.names = F)